## ── Attaching packages ────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.8.0     ✔ stringr 1.3.0
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract

We must filter our BLASTn quantification results by BGC coverage criteria (>=50).

#load in quantifier data 
quantifier_df <- read_delim("combined-MetaHIT_Gut-spanish-quantifier-results.txt", col_names = F, delim = "\t")
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_double(),
##   X3 = col_double(),
##   X4 = col_character()
## )
names(quantifier_df) <- c("bgcName", "Coverage", "RPKM", "Sample") # add column names

#Filter data by coverage 
filter_quantifier_df <- quantifier_df %>% filter(Coverage >= 50)
sample_metadata <- read_csv("sample_metadata.csv", col_names = T) # load metadata
## Parsed with column specification:
## cols(
##   Sample = col_character(),
##   GroupAttribute = col_character()
## )
#add sample metadata to filtered quantifier dataframe 
filter_quantifier_df <- filter_quantifier_df %>% inner_join(., sample_metadata, by = "Sample")
# retrieve BGC class from bgcName 
results_df <- filter_quantifier_df %>% separate(., bgcName,c("sampleID", "scaffoldName", "clusterlen", "bgcClass", "software", "Coord"),sep = "__", remove = F) %>% select(-c(2:4,6:7))

############################
#Group BGC Class for figure 
recode_bgcClass<-function(df){
  PKS <- c("t2pks","transatpks","t1pks","lantipeptide-t2pks","otherks","otherks-transatpks")
  NRPS <- c("nrps","nrps-bacteriocin","nrps-ladderane","ladderane-nrps","nrps-lantipeptide","arylpolyene-nrps")
  Terpene <- "terpene"
  hybrid_pks_nrps<- c("nrps-t1pks","transatpks-otherks-nrps")
  Others <- c("arylpolyene","other", "resorcinol","butyrolactone", "ladderane", "siderophore", "hserlactone", "bacteriocin-arylpolyene","amglyccycl","ladderane-arylpolyene")
  RiPPs <- c("microcin","bacteriocin","lantipeptide","lassopeptide","bacteriocin-lantipeptide","sactipeptide", "thiopeptide","bacteriocin-proteusin", "glycocin","microcin-lassopeptide")
  
  df$bgcClass[df$bgcClass %in% PKS ==TRUE] <- "PKS"
  df$bgcClass[df$bgcClass %in% NRPS ==TRUE] <- "NRPS"
  df$bgcClass[df$bgcClass == Terpene] <- "Terpene"
  df$bgcClass[df$bgcClass %in% hybrid_pks_nrps ==TRUE] <- "Hybrid PKS-NRPS"
  df$bgcClass[df$bgcClass %in% Others ==TRUE] <- "Others"
  df$bgcClass[df$bgcClass %in% RiPPs ==TRUE] <- "RiPPs"
  return(df)
  
}

spanish_filtered_results<-recode_bgcClass(results_df)

#############################
#load in uniprot species data collected from our species profiler
species_df <- read_csv("species-profiler-results/species_results-MetaHIT-spanish_UniProt.csv", col_names = T)
## Parsed with column specification:
## cols(
##   BGC_NAME = col_character(),
##   TAXA_NAME = col_character(),
##   ACC_ID = col_character(),
##   PERC_IDENT = col_double(),
##   COVERAGE = col_integer(),
##   ORGANISM = col_character(),
##   PHYLUM = col_character(),
##   Manual_PHYLUM = col_character(),
##   Manual_ORGANISM = col_character()
## )
species_df[is.na(species_df)]<-"N/A"
species_df$PHYLUM <- ifelse(species_df$PHYLUM == "N/A", species_df$Manual_PHYLUM, species_df$PHYLUM)
species_df$ORGANISM <- ifelse(species_df$ORGANISM == "N/A", species_df$Manual_ORGANISM, species_df$ORGANISM)

species_df$PHYLUM[species_df$PHYLUM == "N/A" ] <-"Unassigned"

species_df_bgc_class <-species_df %>% separate(., BGC_NAME,c("sampleID", "scaffoldName", "clusterlen", "bgcClass", "software", "Coord"),sep = "__", remove = F) %>% select(-c(2:4,6:7))
species_recoded_bgc_results<-recode_bgcClass(species_df_bgc_class)

#Plot BGC counts per Phyla detected 
species_recoded_bgc_results %>% group_by(PHYLUM,bgcClass) %>% count() %>% ggplot(.,aes(reorder(PHYLUM, n), n)) + geom_col(aes(fill = bgcClass)) + scale_fill_npg() + coord_flip()+ theme_bw() 

#ggsave("BGC-counts-perclass_Phyla-barplot.eps", width = 7.3, height = 6.31, units = "in", device = "eps") 
species_recoded_bgc_results %>% group_by(PHYLUM) %>% count() %>% ggplot(.,aes(reorder(PHYLUM, n), n)) + geom_col() + scale_fill_npg() + coord_flip()+ theme_bw() + xlab("Phyla") + ylab("Total BGCs detected") +geom_text(aes(label=n), vjust=0,hjust=-.5)

#ggsave("BGC-counts-perPhyla-barplot.eps", width =11, height = 6.31, units = "in", device = "eps") 

Summary statistics for filtered data per cohort.

### calculate hit ratio for each BGC within each cohort 
calculate_hitRatio <-function(data, metadata){
  metadata_counts<- metadata %>% group_by(GroupAttribute) %>% count() %>% ungroup()
  bgc_counts <- data %>% group_by(bgcName,GroupAttribute) %>% count() %>% ungroup()
  hitRatio_df <- data.frame() # intialize results df 
  for (i in 1:nrow(metadata_counts)){
    diseaseStatus <- metadata_counts[i,]$GroupAttribute
    diseaseStatus_count <- metadata_counts[i,]$n
    disease_hitratio <- bgc_counts %>% filter(GroupAttribute == diseaseStatus) %>%  mutate(., hitRatio = case_when(GroupAttribute == diseaseStatus ~ (n/diseaseStatus_count)*100))
    hitRatio_df<-rbind(hitRatio_df,disease_hitratio)
    
  }
  return(hitRatio_df)
}

spanish_hitratio <- calculate_hitRatio(spanish_filtered_results, sample_metadata)

################################################
#Quanitfication of the exact CB and CC clusters for MetaHIT 
#rainin_df <- read_delim("combined-MetaHIT_Gut-rainin-quantifier-results.txt", delim = "\t",col_names = FALSE)%>%filter(X2 >= 50) %>% inner_join(., sample_metadata, by = c("X4"="Sample"))
#names(rainin_df)[1:4] <- c("bgcName", "Coverage", "RPKM", "Sample") # add column names
######################
#Quanitfication of the exact CB and CC clusters for MetaHIT  with unique reads mapped to each clusters
rainin_df <- read_delim("smNRPS-exact-unique-reads/combined-MetaHIT_Gut-rainin-exact-unique-reads-quantifier-results.txt", delim = "\t",col_names = FALSE)%>%filter(X2 >= 50) %>% inner_join(., sample_metadata, by = c("X4"="Sample"))
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_double(),
##   X3 = col_double(),
##   X4 = col_character()
## )
names(rainin_df)[1:4] <- c("bgcName", "Coverage", "RPKM", "Sample") # add column names

###########################
#rainin_df <- read_delim("smNRPS-exact-unique-reads-updated/combined-MetaHIT_Gut-rainin-exact-unique-reads-updated-quantifier-results.txt", delim = "\t",col_names = FALSE)%>%filter(X2 >= 50) %>% inner_join(., sample_metadata, by = c("X4"="Sample"))
#names(rainin_df)[1:4] <- c("bgcName", "Coverage", "RPKM", "Sample") # add column names

#####################################
rainin_df_hitratio <- calculate_hitRatio(rainin_df, sample_metadata)

prepare_rainin_data <- function(df, metadata_df){
  missingsamples<- metadata_df %>% filter(!Sample %in% df$Sample)
  df_wide <- dcast(df, bgcName ~ Sample, value.var="RPKM") 
  i<-1
  for (i in 1:nrow(missingsamples)){
    sample_col <- missingsamples[i,]$Sample
    df_wide[, sample_col]<-NA
    i<- i+1 
  
  }
  df_wide[is.na(df_wide)] <- 0 
  df_melt <- melt(df_wide) %>%  mutate_if(is.factor, as.character) %>% inner_join(.,metadata_df, by = c("variable"="Sample") )
  names(df_melt)[2]<-"Sample"
  names(df_melt)[3]<-"RPKM"
  return(df_melt)
  
}

rainin_df_update<- prepare_rainin_data(rainin_df, sample_metadata)
## Using bgcName as id variables
rainin_df_update %>% filter(bgcName !="smNRPS-BP") %>% group_by(GroupAttribute,bgcName) %>% mutate(ymean= mean(RPKM)) %>% mutate(ymedian= median(RPKM)) %>%  ggplot(., aes(Sample,RPKM)) + geom_col()+geom_hline(aes(yintercept=ymedian), color="red", linetype="dashed")+geom_hline(aes(yintercept=ymean), color="red", linetype="solid")+theme_pubclean()+ facet_grid(bgcName~GroupAttribute, switch= "x",scales = "free_x", space = "free_x") + xlab("Disease")+  theme(
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) + theme(plot.title = element_text(hjust = 0.5,face = "bold",size = "24"))

#ggsave("combined-smNRPS-MetaHIT-barplot.eps", width = 11, height = 5, units = "in", device = "eps") 
#ggsave("combined-smNRPS-exact-unique-reads-MetaHIT-barplot.eps", width = 11, height = 5, units = "in", device = "eps") 

#ggsave("combined-smNRPS-exact-unique-reads-MetaHIT-update-barplot.eps", width = 11, height = 5, units = "in", device = "eps") 


rainin_df_update %>% filter(bgcName !="smNRPS-BP") %>% group_by(GroupAttribute) %>% mutate(ymean= mean(RPKM)) %>% mutate(ymedian= median(RPKM)) %>% ggplot(., aes(Sample,RPKM)) + geom_bar(stat = "identity", aes(fill=bgcName))+ theme_pubclean()+geom_hline(aes(yintercept=ymedian), color="black", linetype="dashed")+geom_hline(aes(yintercept=ymean), color="black", linetype="solid")+ facet_grid(~GroupAttribute, switch= "x",scales = "free_x", space = "free_x") + xlab("Disease")+  theme(
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) + theme(plot.title = element_text(hjust = 0.5,face = "bold",size = "24")) + scale_fill_npg() + coord_cartesian(ylim=c(0, 20))

#ggsave("combined-smNRPS-exact-unique-reads-MetaHIT-barplot-combined-max20.eps", width = 11, height = 5, units = "in", device = "eps") 

rainin_df_update %>% filter(bgcName !="smNRPS-BP") %>% group_by(GroupAttribute) %>% mutate(ymean= mean(RPKM)) %>% mutate(ymedian= median(RPKM)) %>% ggplot(., aes(Sample,RPKM)) + geom_bar(stat = "identity", aes(fill=bgcName))+ theme_pubclean()+geom_hline(aes(yintercept=ymedian), color="black", linetype="dashed")+geom_hline(aes(yintercept=ymean), color="black", linetype="solid")+ facet_grid(~GroupAttribute, switch= "x",scales = "free_x", space = "free_x") + xlab("Disease")+  theme(
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) + theme(plot.title = element_text(hjust = 0.5,face = "bold",size = "24")) + scale_fill_npg() 

#ggsave("combined-smNRPS-exact-unique-reads-MetaHIT-barplot-combined.eps", width = 11, height = 5, units = "in", device = "eps") 
#######################################################################################################

bar_plot_per_cluster<- function(df, cluster){
  plot_title <- paste0(cluster, "-","barplot",".eps")
  df %>% filter(bgcName == cluster) %>% group_by(GroupAttribute) %>% mutate(ymean= mean(RPKM)) %>% mutate(ymedian= median(RPKM)) %>%  ggplot(., aes(Sample,RPKM)) + geom_col()+geom_hline(aes(yintercept=ymedian), color="red", linetype="dashed")+geom_hline(aes(yintercept=ymean), color="red", linetype="solid")+theme_pubclean()+ facet_grid(~GroupAttribute, switch= "x",scales = "free_x", space = "free_x") + xlab("Disease")+  theme(
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) + ggtitle(cluster) + theme(plot.title = element_text(hjust = 0.5,face = "bold",size = "24"))
  
   #ggsave(plot_title, width = 11, height = 5, units = "in", device = "eps") 

}

bar_plot_per_cluster(rainin_df_update, "smNRPS-CC" )

bar_plot_per_cluster(rainin_df_update, "smNRPS-CB" )

#bar_plot_per_cluster(rainin_df_update, "smNRPS-BP" )


################################################
#smNRPS-CB:V1.UC61.0__NODE_930_length_26002_cov_3.77986__26002__nrps__ANTISMASH__0_26002 
spanish_hitratio %>% filter(bgcName == "V1.UC61.0__NODE_930_length_26002_cov_3.77986__26002__nrps__ANTISMASH__0_26002")
## # A tibble: 3 x 4
##   bgcName                                    GroupAttribute     n hitRatio
##   <chr>                                      <chr>          <int>    <dbl>
## 1 V1.UC61.0__NODE_930_length_26002_cov_3.77… C                  5     7.04
## 2 V1.UC61.0__NODE_930_length_26002_cov_3.77… CD                13    61.9 
## 3 V1.UC61.0__NODE_930_length_26002_cov_3.77… UC                12     9.45
#smNRPS-CC:V1.CD18.3__NODE_78_length_114577_cov_12.9704__43190__nrps__ANTISMASH__49316_92506
spanish_hitratio %>% filter(bgcName == "V1.CD18.3__NODE_78_length_114577_cov_12.9704__43190__nrps__ANTISMASH__49316_92506")
## # A tibble: 3 x 4
##   bgcName                                    GroupAttribute     n hitRatio
##   <chr>                                      <chr>          <int>    <dbl>
## 1 V1.CD18.3__NODE_78_length_114577_cov_12.9… C                  3     4.23
## 2 V1.CD18.3__NODE_78_length_114577_cov_12.9… CD                17    81.0 
## 3 V1.CD18.3__NODE_78_length_114577_cov_12.9… UC                 5     3.94
suppressWarnings(describeBy(spanish_hitratio, spanish_hitratio$GroupAttribute)) # #Stats for Hit Ratio
## 
##  Descriptive statistics by group 
## group: C
##                 vars   n  mean    sd median trimmed   mad  min   max range
## bgcName*           1 818   NaN    NA     NA     NaN    NA  Inf  -Inf  -Inf
## GroupAttribute*    2 818   NaN    NA     NA     NaN    NA  Inf  -Inf  -Inf
## n                  3 818 15.53 17.25   8.00   12.26  8.90 1.00 70.00 69.00
## hitRatio           4 818 21.88 24.29  11.27   17.26 12.53 1.41 98.59 97.18
##                 skew kurtosis   se
## bgcName*          NA       NA   NA
## GroupAttribute*   NA       NA   NA
## n               1.43     1.01 0.60
## hitRatio        1.43     1.01 0.85
## -------------------------------------------------------- 
## group: CD
##                 vars   n  mean    sd median trimmed   mad  min   max range
## bgcName*           1 473   NaN    NA     NA     NaN    NA  Inf  -Inf  -Inf
## GroupAttribute*    2 473   NaN    NA     NA     NaN    NA  Inf  -Inf  -Inf
## n                  3 473  4.92  4.53   3.00    4.07  2.97 1.00 20.00 19.00
## hitRatio           4 473 23.44 21.58  14.29   19.37 14.12 4.76 95.24 90.48
##                 skew kurtosis   se
## bgcName*          NA       NA   NA
## GroupAttribute*   NA       NA   NA
## n               1.43     1.42 0.21
## hitRatio        1.43     1.42 0.99
## -------------------------------------------------------- 
## group: UC
##                 vars   n  mean    sd median trimmed   mad  min    max
## bgcName*           1 879   NaN    NA     NA     NaN    NA  Inf   -Inf
## GroupAttribute*    2 879   NaN    NA     NA     NaN    NA  Inf   -Inf
## n                  3 879 23.68 28.79  10.00   17.72 11.86 1.00 120.00
## hitRatio           4 879 18.65 22.67   7.87   13.95  9.34 0.79  94.49
##                 range skew kurtosis   se
## bgcName*         -Inf   NA       NA   NA
## GroupAttribute*  -Inf   NA       NA   NA
## n               119.0 1.58     1.51 0.97
## hitRatio         93.7 1.58     1.51 0.76
suppressWarnings(describeBy(spanish_filtered_results, spanish_filtered_results$GroupAttribute)) # Stats for RPKM 
## 
##  Descriptive statistics by group 
## group: C
##                 vars     n  mean    sd median trimmed   mad   min    max
## bgcName*           1 12707   NaN    NA     NA     NaN    NA   Inf   -Inf
## bgcClass*          2 12707   NaN    NA     NA     NaN    NA   Inf   -Inf
## Coverage           3 12707 77.28 15.47  77.95   77.60 20.88 50.00 100.00
## RPKM               4 12707  3.34  5.72   1.49    2.12  1.48  0.06 126.82
## Sample*            5 12707   NaN    NA     NA     NaN    NA   Inf   -Inf
## GroupAttribute*    6 12707   NaN    NA     NA     NaN    NA   Inf   -Inf
##                  range  skew kurtosis   se
## bgcName*          -Inf    NA       NA   NA
## bgcClass*         -Inf    NA       NA   NA
## Coverage         50.00 -0.12    -1.29 0.14
## RPKM            126.76  6.15    67.98 0.05
## Sample*           -Inf    NA       NA   NA
## GroupAttribute*   -Inf    NA       NA   NA
## -------------------------------------------------------- 
## group: CD
##                 vars    n  mean    sd median trimmed   mad   min    max
## bgcName*           1 2328   NaN    NA     NA     NaN    NA   Inf   -Inf
## bgcClass*          2 2328   NaN    NA     NA     NaN    NA   Inf   -Inf
## Coverage           3 2328 78.61 15.67  79.92   79.18 20.53 50.01 100.00
## RPKM               4 2328  7.18 12.03   2.32    4.33  2.66  0.08 115.83
## Sample*            5 2328   NaN    NA     NA     NaN    NA   Inf   -Inf
## GroupAttribute*    6 2328   NaN    NA     NA     NaN    NA   Inf   -Inf
##                  range  skew kurtosis   se
## bgcName*          -Inf    NA       NA   NA
## bgcClass*         -Inf    NA       NA   NA
## Coverage         49.99 -0.21    -1.26 0.32
## RPKM            115.75  3.35    15.25 0.25
## Sample*           -Inf    NA       NA   NA
## GroupAttribute*   -Inf    NA       NA   NA
## -------------------------------------------------------- 
## group: UC
##                 vars     n  mean    sd median trimmed   mad   min    max
## bgcName*           1 20818   NaN    NA     NA     NaN    NA   Inf   -Inf
## bgcClass*          2 20818   NaN    NA     NA     NaN    NA   Inf   -Inf
## Coverage           3 20818 77.63 15.53  78.31   77.99 21.02 50.00 100.00
## RPKM               4 20818  3.54  7.04   1.65    2.27  1.67  0.09 436.45
## Sample*            5 20818   NaN    NA     NA     NaN    NA   Inf   -Inf
## GroupAttribute*    6 20818   NaN    NA     NA     NaN    NA   Inf   -Inf
##                  range  skew kurtosis   se
## bgcName*          -Inf    NA       NA   NA
## bgcClass*         -Inf    NA       NA   NA
## Coverage         50.00 -0.13    -1.30 0.11
## RPKM            436.36 22.90  1187.34 0.05
## Sample*           -Inf    NA       NA   NA
## GroupAttribute*   -Inf    NA       NA   NA
spanish_filtered_results %>% group_by(GroupAttribute) %>% count(Sample) %>% describeBy(., .$GroupAttribute)# Stats for Total BGCs in each sample
## Warning in FUN(data[x, , drop = FALSE], ...): NAs introduced by coercion

## Warning in FUN(data[x, , drop = FALSE], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(data[x, , drop = FALSE], ...): NAs introduced by coercion

## Warning in FUN(data[x, , drop = FALSE], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(data[x, , drop = FALSE], ...): NAs introduced by coercion

## Warning in FUN(data[x, , drop = FALSE], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## 
##  Descriptive statistics by group 
## group: C
##                 vars  n   mean    sd median trimmed   mad min  max range
## GroupAttribute*    1 71    NaN    NA     NA     NaN    NA Inf -Inf  -Inf
## Sample*            2 71    NaN    NA     NA     NaN    NA Inf -Inf  -Inf
## n                  3 71 178.97 71.98    186  184.68 65.23   2  315   313
##                  skew kurtosis   se
## GroupAttribute*    NA       NA   NA
## Sample*            NA       NA   NA
## n               -0.71    -0.02 8.54
## -------------------------------------------------------- 
## group: CD
##                 vars  n   mean    sd median trimmed   mad min  max range
## GroupAttribute*    1 21    NaN    NA     NA     NaN    NA Inf -Inf  -Inf
## Sample*            2 21    NaN    NA     NA     NaN    NA Inf -Inf  -Inf
## n                  3 21 110.86 53.26    113  110.06 54.86  12  214   202
##                 skew kurtosis    se
## GroupAttribute*   NA       NA    NA
## Sample*           NA       NA    NA
## n               0.09    -0.88 11.62
## -------------------------------------------------------- 
## group: UC
##                 vars   n   mean    sd median trimmed   mad min  max range
## GroupAttribute*    1 127    NaN    NA     NA     NaN    NA Inf -Inf  -Inf
## Sample*            2 127    NaN    NA     NA     NaN    NA Inf -Inf  -Inf
## n                  3 127 163.92 72.48    179  168.23 62.27   2  297   295
##                  skew kurtosis   se
## GroupAttribute*    NA       NA   NA
## Sample*            NA       NA   NA
## n               -0.59    -0.59 6.43
# Try to order facets.
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
  new_x <- paste(x, within, sep = sep)
  stats::reorder(new_x, by, FUN = fun)
}


scale_x_reordered <- function(..., sep = "___") {
  reg <- paste0(sep, ".+$")
  ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}


scale_y_reordered <- function(..., sep = "___") {
  reg <- paste0(sep, ".+$")
  ggplot2::scale_y_discrete(labels = function(x) gsub(reg, "", x), ...)
}
######## 
#figure for BGC cclass total molecules detected in each samples in each cohort 
spanish_filtered_results %>% group_by(GroupAttribute,bgcClass) %>% count(Sample) %>% ggplot(.,aes(reorder_within(Sample, n, GroupAttribute), n)) + geom_col(aes(fill = bgcClass)) + scale_fill_npg()+ facet_grid(~GroupAttribute, switch = "x", scales = "free_x", space = "free_x") + scale_x_reordered()+
    theme(panel.spacing = unit(0, "lines"), 
         strip.background = element_blank()) + xlab("Disease Status")+ ylab("Total BGCs detected") + theme_bw() + theme(axis.text.x = element_blank(),axis.ticks.x = element_blank())

#########

wilcoxon_test<-function(df, cohort, testType){
  df$GroupAttribute <-as.factor(df$GroupAttribute)
  print(levels(df$GroupAttribute))
  results<-df  %>% group_by(GroupAttribute) %>% count(Sample) %>% wilcox.test(n ~ GroupAttribute, data=., paired=F, alternative = testType,subset = GroupAttribute %in% cohort )
  print(results)
}

#levels = "C"  "CD" "UC"
wilcoxon_test(spanish_filtered_results, c("C", "CD"), "greater") # twosided=8.806e-05/ 4.403e-05
## [1] "C"  "CD" "UC"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  n by GroupAttribute
## W = 1167.5, p-value = 4.403e-05
## alternative hypothesis: true location shift is greater than 0
wilcoxon_test(spanish_filtered_results, c("CD", "UC"), "less") # twosided=0.0008254,0.0004127
## [1] "C"  "CD" "UC"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  n by GroupAttribute
## W = 724.5, p-value = 0.0004127
## alternative hypothesis: true location shift is less than 0
wilcoxon_test(spanish_filtered_results, c("C", "UC"), "greater")#twosided=0.175, 0.08748
## [1] "C"  "CD" "UC"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  n by GroupAttribute
## W = 5033.5, p-value = 0.08748
## alternative hypothesis: true location shift is greater than 0
############
#Determine the mean BGCs detected in each cohort pairs 
spanish_filtered_results %>% filter(GroupAttribute %in% c("C", "CD")) %>% count(Sample) %>% inner_join(., sample_metadata) %>% group_by(GroupAttribute) %>% mutate(meanBGC = mean(n)) %>% distinct(meanBGC, GroupAttribute)
## Joining, by = "Sample"
## # A tibble: 2 x 2
## # Groups:   GroupAttribute [2]
##   GroupAttribute meanBGC
##   <chr>            <dbl>
## 1 C                 179.
## 2 CD                111.
spanish_filtered_results %>% filter(GroupAttribute %in% c("C", "UC")) %>% count(Sample) %>% inner_join(., sample_metadata) %>% group_by(GroupAttribute) %>% mutate(meanBGC = mean(n)) %>% distinct(meanBGC, GroupAttribute)
## Joining, by = "Sample"
## # A tibble: 2 x 2
## # Groups:   GroupAttribute [2]
##   GroupAttribute meanBGC
##   <chr>            <dbl>
## 1 UC                164.
## 2 C                 179.
spanish_filtered_results %>% filter(GroupAttribute == "C" | GroupAttribute == "CD") %>% group_by(GroupAttribute) %>% count(Sample)%>% ggplot(., aes(GroupAttribute, n, color = GroupAttribute)) + geom_boxplot(outlier.colour = NA) +  geom_jitter(width = 0.2)+stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..), width = .75, linetype = "dashed")+scale_color_npg()+ theme_classic() +annotate("text", x = 1.5, y = 300, label = "Wilcoxon, p = 4.403e-05")+stat_compare_means(comparisons = list( c("C", "CD")),label = "p.signif", paired =F) 

spanish_filtered_results %>% filter(GroupAttribute == "UC" | GroupAttribute == "CD") %>% group_by(GroupAttribute) %>% count(Sample)%>% ggplot(., aes(GroupAttribute, n, color = GroupAttribute)) + geom_boxplot(outlier.colour = NA) +  geom_jitter(width = 0.2)+stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..), width = .75, linetype = "dashed")+scale_color_npg()+ theme_classic() +annotate("text", x = 1.5, y = 300, label = "Wilcoxon, p = 0.0004127")+stat_compare_means(comparisons = list( c("UC", "CD")),label = "p.signif", paired =F) 

spanish_filtered_results %>% filter(GroupAttribute == "UC" | GroupAttribute == "C") %>% group_by(GroupAttribute) %>% count(Sample)%>% ggplot(., aes(GroupAttribute, n, color = GroupAttribute)) + geom_boxplot(outlier.colour = NA) +  geom_jitter(width = 0.2)+stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..), width = .75, linetype = "dashed")+scale_color_npg()+ theme_classic() +annotate("text", x = 1.5, y = 300, label = "Wilcoxon, p = 0.08748")+stat_compare_means(comparisons = list( c("UC", "C")),label = "p.format", paired =F) 

Analysis for Crohns vs Controls: iterative KW test; LDA analysis; Random forest

# Function to add BGCs that zeros when a BGC was not found in a Sample 
prepare_data<-function(df, metadata_df){
  data_wide <- dcast(df, bgcName ~ Sample, value.var="RPKM") 
  data_wide[is.na(data_wide)] <- 0 
  data_melt <- melt(data_wide) %>%  mutate_if(is.factor, as.character) %>% inner_join(.,metadata_df, by = c("variable"="Sample") )
  names(data_melt)[2]<-"Sample"
  names(data_melt)[3]<-"RPKM"
  return(data_melt)
  
}

filter_quantifier_df_updated <- prepare_data(filter_quantifier_df, sample_metadata) 
## Using bgcName as id variables
quantifier_CD_C <-filter_quantifier_df_updated %>% filter(GroupAttribute == "CD" | GroupAttribute == "C") 
quantifier_UC_C <-filter_quantifier_df_updated %>% filter(GroupAttribute == "UC" | GroupAttribute == "C") 
quantifier_UC_CD <-filter_quantifier_df_updated %>% filter(GroupAttribute == "UC" | GroupAttribute == "CD") 

################
#Rainin not exact clusters barplot
bar_plot_per_cluster(filter_quantifier_df_updated, "V1.UC61.0__NODE_930_length_26002_cov_3.77986__26002__nrps__ANTISMASH__0_26002" ) #smNRPS CB 

bar_plot_per_cluster(filter_quantifier_df_updated, "V1.CD18.3__NODE_78_length_114577_cov_12.9704__43190__nrps__ANTISMASH__49316_92506" ) #smNRPS CC 

###############
#Enriched CD E.coli dervived clusters 
bar_plot_per_cluster(filter_quantifier_df_updated, "V1.UC61.0__NODE_228_length_65988_cov_4.67854__26289__nrps__ANTISMASH__39699_65988" )

bar_plot_per_cluster(filter_quantifier_df_updated, "V1.UC61.0__NODE_2273_length_12464_cov_2.76678__12464__t1pks__ANTISMASH__0_12464" )

bar_plot_per_cluster(filter_quantifier_df_updated,"V1.CD41.0__NODE_43_length_139509_cov_97.9643__53525__nrps__ANTISMASH__80787_134312")

bar_plot_per_cluster(filter_quantifier_df_updated,"V1.UC5.0__NODE_2999_length_8272_cov_2.30449__8272__bacteriocin__ANTISMASH__0_8272")

bar_plot_per_cluster(filter_quantifier_df_updated,"V1.CD35.1__NODE_3185_length_6846_cov_4.30231__6846__siderophore__ANTISMASH__0_6846")

bar_plot_per_cluster(filter_quantifier_df_updated,"V1.CD10.0__NODE_453_length_17115_cov_2.32239__17115__siderophore__ANTISMASH__0_17115")

bar_plot_per_cluster(filter_quantifier_df_updated,"V1.CD18.3__NODE_40_length_166683_cov_10.5209__59165__nrps-t1pks__ANTISMASH__61574_120739")

ecoli_bgcs<-c("V1.UC61.0__NODE_228_length_65988_cov_4.67854__26289__nrps__ANTISMASH__39699_65988","V1.UC61.0__NODE_2273_length_12464_cov_2.76678__12464__t1pks__ANTISMASH__0_12464","V1.CD41.0__NODE_43_length_139509_cov_97.9643__53525__nrps__ANTISMASH__80787_134312","V1.UC5.0__NODE_2999_length_8272_cov_2.30449__8272__bacteriocin__ANTISMASH__0_8272", "V1.CD35.1__NODE_3185_length_6846_cov_4.30231__6846__siderophore__ANTISMASH__0_6846", "V1.CD10.0__NODE_453_length_17115_cov_2.32239__17115__siderophore__ANTISMASH__0_17115", "V1.CD18.3__NODE_40_length_166683_cov_10.5209__59165__nrps")

ecoli_bgc_data<- filter_quantifier_df %>% filter(bgcName %in% ecoli_bgcs) 

#################

#Format dataframe to run lefse 
source("format_data_for_lefse.R")

quantifier_CD_C_lefse<-format_data_for_lefse(quantifier_CD_C)
quantifier_UC_C_lefse<-format_data_for_lefse(quantifier_UC_C)
quantifier_UC_CD_lefse<-format_data_for_lefse(quantifier_UC_CD)

#write_delim(quantifier_CD_C_lefse, "quantifier-results-CD_C-spanish-for-lefse.txt",delim="\t")
#write_delim(quantifier_UC_C_lefse, "quantifier-results-UC_C-spanish-for-lefse.txt",delim="\t")
#write_delim(quantifier_UC_CD_lefse, "quantifier-results-UC_CD-spanish-for-lefse.txt",delim="\t")
# Analysis with pvalue <= .01; LDA Score >2 
########################################################
lefse_CD_C <- read_delim("LEfSe-results/LDA_Effect_Size_CD_C_stricter_pvalue.txt",col_names = F, delim = "\t")
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_double(),
##   X3 = col_character(),
##   X4 = col_double(),
##   X5 = col_character()
## )
colnames(lefse_CD_C)<-c("BGC", "log(HighestMean_AllClasses)","class_discriminitive","LDA_score", "pvalue")

lefse_CD_C_discriminative_features <- lefse_CD_C %>% filter(!is.na(class_discriminitive) & !is.na(LDA_score))


lefse_CD_C_discriminative_features<-lefse_CD_C_discriminative_features %>% mutate(LDA_score = ifelse(class_discriminitive == "C", -abs(LDA_score), LDA_score))
                                    
lefse_CD_C_discriminative_features %>%  mutate(BGC = reorder(BGC, LDA_score)) %>%
  ggplot(aes(BGC, LDA_score, fill = class_discriminitive)) +
  geom_bar(stat = "identity")  +
  ylab("LDA SCORE (log 10)") +
  coord_flip() + theme_bw() + scale_fill_manual(values = c("#DC0000FF", "#00A087FF")) + theme(legend.position="top") +theme(legend.title=element_blank())

 #ggsave("lefse_CD_C_discriminative_features.eps", width = 10, height = 15, units = "in", device = "eps") 

########################################################
 
lefse_UC_C <- read_delim("LEfSe-results/LDA_Effect_Size_UC_C_stricter_pvalue.txt",col_names = F, delim = "\t")
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_double(),
##   X3 = col_character(),
##   X4 = col_double(),
##   X5 = col_character()
## )
colnames(lefse_UC_C)<-c("BGC", "log(HighestMean_AllClasses)","class_discriminitive","LDA_score", "pvalue")

lefse_UC_C_discriminative_features <- lefse_UC_C %>% filter(!is.na(class_discriminitive) & !is.na(LDA_score))


lefse_UC_C_discriminative_features<-lefse_UC_C_discriminative_features %>% mutate(LDA_score = ifelse(class_discriminitive == "C", -abs(LDA_score), LDA_score))
                                    
lefse_UC_C_discriminative_features %>%  mutate(BGC = reorder(BGC, LDA_score)) %>%
  ggplot(aes(BGC, LDA_score, fill = class_discriminitive)) +
  geom_bar(stat = "identity")  +
  ylab("LDA SCORE (log 10)") +
  coord_flip() + theme_bw() + scale_fill_manual(values = c("#DC0000FF", "#00A087FF")) + theme(legend.position="top") +theme(legend.title=element_blank())

#ggsave("lefse_UC_C_discriminative_features.eps", width = 10, height = 15, units = "in", device = "eps") 



########################################################
lefse_UC_CD <- read_delim("LEfSe-results/LDA_Effect_Size_UC_CD_stricter_pvalue.txt",col_names = F, delim = "\t")
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_double(),
##   X3 = col_character(),
##   X4 = col_double(),
##   X5 = col_character()
## )
colnames(lefse_UC_CD)<-c("BGC", "log(HighestMean_AllClasses)","class_discriminitive","LDA_score", "pvalue")

lefse_UC_CD_discriminative_features <- lefse_UC_CD %>% filter(!is.na(class_discriminitive) & !is.na(LDA_score))


lefse_UC_CD_discriminative_features<-lefse_UC_CD_discriminative_features %>% mutate(LDA_score = ifelse(class_discriminitive == "CD", -abs(LDA_score), LDA_score))
                                    
lefse_UC_CD_discriminative_features %>%  mutate(BGC = reorder(BGC, LDA_score)) %>%
  ggplot(aes(BGC, LDA_score, fill = class_discriminitive)) +
  geom_bar(stat = "identity")  +
  ylab("LDA SCORE (log 10)") +
  coord_flip() + theme_bw() + scale_fill_manual(values = c("#DC0000FF", "#00A087FF")) + theme(legend.position="top") +theme(legend.title=element_blank())

#ggsave("lefse_UC_CD_discriminative_features.eps", width = 10, height = 10, units = "in", device = "eps") 
#Venn diagram 
# 
# lefse_CD_C_discriminative_features
# lefse_UC_C_discriminative_features
# lefse_UC_CD_discriminative_features
# 
# enrichedUC_C_list<-lefse_UC_C_discriminative_features%>% filter(class_discriminitive == "UC")
# enrichedUC_CD_list<-lefse_UC_CD_discriminative_features%>% filter(class_discriminitive == "UC")
# gplots::venn(list(enrichedUC_CD = enrichedUC_CD_list$BGC, enrichedUC_C= enrichedUC_C_list$BGC))
#iterative KW similarly to Lefse to retreive p-values for all BGCs and compare to Lefse results 
#normalized by LEfse methods 
iterativeKW<-function(df, features){
  df<- df %>% group_by(Sample) %>% mutate(sumRPKM = sum(RPKM)) %>% mutate(normalizedRPKM = (RPKM/sumRPKM) * 1000000)
  df$GroupAttribute <-as.factor(df$GroupAttribute)
  features_pvalues<-list()
  pvalues<-list()
  i <-1
  for (i in 1:length(features)){
    featureDF<- df %>% filter(bgcName == features[i]) # current feature data 
    kwResult<-kruskal.test(normalizedRPKM ~ GroupAttribute, data = featureDF)
    features_pvalues <- append(features_pvalues, list(features[i]))
    pvalues<-append(pvalues, list(kwResult$p.value))

    i<-i+1
  }
  resultsDF<-do.call(rbind, Map(data.frame, A=features_pvalues, B=pvalues))
  names(resultsDF) <- c("features", "pvalues")
  #resultsDF$BH.adjusted <- p.adjust(resultsDF$pvalues, method="BH", n = length(resultsDF$pvalues)) # adjust pvalues 
  return(resultsDF)
  
}

####################################
quantifier_CD_C_KW <- iterativeKW(quantifier_CD_C,unique(quantifier_CD_C$bgcName)) 
quantifier_CD_C_KW$pvalues[quantifier_CD_C_KW$pvalues == "NaN"]<- "-"

###################################
quantifier_UC_C_KW <- iterativeKW(quantifier_UC_C,unique(quantifier_UC_C$bgcName)) 
quantifier_UC_C_KW$pvalues[quantifier_UC_C_KW$pvalues == "NaN"]<- "-"

###################################
quantifier_UC_CD_KW <- iterativeKW(quantifier_UC_CD,unique(quantifier_UC_CD$bgcName)) 
quantifier_UC_CD_KW$pvalues[quantifier_UC_C_KW$pvalues == "NaN"]<- "-"

###################################
supplementary_tables<-function(df, kw_df,taxa_df){
  
  new_df <- df %>% separate(., BGC, into=c("Sample","ScaffoldName", "bgcLen", "bgcClass", "Software","Coords" ), sep = "__")
  recode_df<- new_df %>% mutate(Sample = str_replace_all(Sample, "_", ".")) %>% mutate(ScaffoldName=stringi::stri_replace_last_fixed(ScaffoldName, '_', '.')) %>% mutate(bgcClass=str_replace_all(bgcClass, '_', '-'))%>%unite(., "BGC",c("Sample","ScaffoldName",  "bgcLen", "bgcClass", "Software","Coords" ), sep = "__")
  recode_df[is.na(recode_df)]<-"-"
  
  supp_table<-recode_df%>% inner_join(.,kw_df, by= c("BGC" = "features"))%>% inner_join(.,taxa_df, by = c("BGC"="BGC_NAME")) %>% select(., 1:6,8,11:12)
  names(supp_table)[5]<-"LEfSe_pvalue"
  names(supp_table)[6]<-"manual_pvalue"
  names(supp_table)[7]<-"NCBI refrence sequence acession number"
  names(supp_table)[8]<-"Taxa Name"
  names(supp_table)[9]<-"Phylum"
  return(supp_table)
  }

lefse_CD_C_supp_table<-supplementary_tables(lefse_CD_C,quantifier_CD_C_KW,species_df)
## Warning: Column `BGC`/`features` joining character vector and factor,
## coercing into character vector
#write_csv(lefse_CD_C_supp_table, "lefse_CD_C_supp_table_1.csv", col_names = T)
lefse_UC_C_supp_table<-supplementary_tables(lefse_UC_C,quantifier_UC_C_KW,species_df)
## Warning: Column `BGC`/`features` joining character vector and factor,
## coercing into character vector
#write_csv(lefse_UC_C_supp_table, "lefse_UC_C_supp_table_2.csv", col_names = T)
lefse_UC_CD_supp_table<-supplementary_tables(lefse_UC_CD,quantifier_UC_CD_KW,species_df)
## Warning: Column `BGC`/`features` joining character vector and factor,
## coercing into character vector
#write_csv(lefse_UC_CD_supp_table, "lefse_UC_CD_supp_table_3.csv", col_names = T)